practical_2
#install.packages(c("readr", "dplyr", "tidyr", "ggplot2", "tidytext", "textstem", "clinspacy", "topicmodels", "reshape2"), dependencies = TRUE)

# Setup: configure knitr and attach every package used in this practical.
# NOTE: the attach ORDER matters — later packages mask earlier ones (see the
# masking messages below, e.g. dplyr::filter masks stats::filter).
knitr::opts_chunk$set(echo = TRUE)
library(readr)
## Warning: package 'readr' was built under R version 4.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(scales)
## Warning: package 'scales' was built under R version 4.3.2
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(textstem)
## Warning: package 'textstem' was built under R version 4.3.3
## Loading required package: koRpus.lang.en
## Warning: package 'koRpus.lang.en' was built under R version 4.3.3
## Loading required package: koRpus
## Warning: package 'koRpus' was built under R version 4.3.3
## Loading required package: sylly
## Warning: package 'sylly' was built under R version 4.3.3
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
## 
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
## 
##     tokenize
library(clinspacy)
## Warning: package 'clinspacy' was built under R version 4.3.3
## Welcome to clinspacy.
## By default, this package will install and use miniconda and create a "clinspacy" conda environment.
## If you want to override this behavior, use clinspacy_init(miniconda = FALSE) and specify an alternative environment using reticulate::use_python() or reticulate::use_conda().
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.3.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.2

Data Parsing

# load the MTSamples clinical-notes dataset bundled with clinspacy
raw.data <- clinspacy::dataset_mtsamples()

# a fast glimpse on the columns contained in the dataset
dplyr::glimpse(raw.data)
## Rows: 4,999
## Columns: 6
## $ note_id           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ description       <chr> "A 23-year-old white female presents with complaint …
## $ medical_specialty <chr> "Allergy / Immunology", "Bariatrics", "Bariatrics", …
## $ sample_name       <chr> "Allergic Rhinitis", "Laparoscopic Gastric Bypass Co…
## $ transcription     <chr> "SUBJECTIVE:,  This 23-year-old white female present…
## $ keywords          <chr> "allergy / immunology, allergic rhinitis, allergies,…
# print blank lines to visually separate the outputs
cat(c("",""), sep="\n\n")
# variable type for each column
sapply(raw.data, class)
##           note_id       description medical_specialty       sample_name 
##         "integer"       "character"       "character"       "character" 
##     transcription          keywords 
##       "character"       "character"
cat(c("",""), sep="\n\n")
# number of distinct values (levels) in "medical_specialty"
cat(c("Number of medical specialties = ", raw.data %>% dplyr::select(medical_specialty) %>% dplyr::n_distinct()), sep=" ")
## Number of medical specialties =  40

Q1) Using the output of dplyr’s glimpse command (or rstudio’s data viewer by clicking on raw.data in the Environment pane) provide a description of what you think each variable in this dataset contains.

Based on the manual inspection and the printed outputs above, we can summarise the variables as in the following table.


Feature           | Provided Information                                                        | Type                 |
------------------| ----------------------------------------------------------------------------| ---------------------|
note_id:          | Unique identifier for each medical note                                     | Integer (identifier) |
description:      | Text description about the observed symptoms of the patient                 | Text/String          |
medical_specialty:| Classified main category or branch of medicine for the diagnosed symptoms   | Factor (nominal)     |
sample_name:      | Sub-category stating the specific details of the diagnosed symptoms         | Factor (nominal)     |
transcription:    | Full text of the medical notes showing all observations and medical details | Text/String          |
keywords:         | Extracted important clinical words or entities related to this visit        | Text/String          |
#  number of transcripts from each specialty, shown as a horizontal bar chart
#  (labs is namespace-qualified for consistency with the other ggplot2 calls)
ggplot2::ggplot(raw.data, ggplot2::aes(y=medical_specialty)) + ggplot2::geom_bar() + ggplot2::labs(x="Document Count", y="Medical Speciality")

#  keep only the three specialties analysed in the rest of this practical
filtered.data <- raw.data %>% dplyr::filter(medical_specialty %in% c("Orthopedic", "Radiology", "Surgery")) 

Text Processing

# Build per-note cleaned transcriptions: tokenize, strip punctuation,
# drop numeric tokens and stop words, then re-assemble each note's text.
analysis.data <- filtered.data %>%
  unnest_tokens(word, transcription) %>%
  
  ## strip any non-alphanumeric characters remaining inside each token
  mutate(word = str_replace_all(word, "[^[:alnum:]]", "")) %>%
  ## drop tokens that contain digits (this is the step that removes numbers)
  filter(!str_detect(word, "[0-9]")) %>%
  
  ## remove stop words
  ##  `tidytext::stop_words` joined via `dplyr::anti_join()` on `word`
  anti_join(stop_words) %>%
  
  ## per each note, combine the tokens back into one string, replacing the
  ## original 'transcription' column; note_id is unique so left_join is 1:1
  group_by(note_id) %>%
  summarise(transcription = paste(word, collapse = " ")) %>%
  left_join(select(filtered.data, -transcription), by = "note_id")
## Joining with `by = join_by(word)`
#  uni-gram tokenisation of the cleaned transcriptions
tokenized.data.unigram <- analysis.data %>% tidytext::unnest_tokens(word, transcription, to_lower=TRUE)
#  bi-gram tokenisation (pairs of consecutive tokens in the cleaned text)
tokenized.data.bigram <- analysis.data %>% tidytext::unnest_tokens(ngram, transcription, token = "ngrams", n=2, to_lower = TRUE)
#  number of distinct stop words per lexicon in `tidytext::stop_words`
tidytext::stop_words %>% dplyr::group_by(lexicon) %>% dplyr::distinct(word) %>% dplyr::summarise(n=dplyr::n())
## # A tibble: 3 × 2
##   lexicon      n
##   <chr>    <int>
## 1 SMART      570
## 2 onix       398
## 3 snowball   174

Q2) How many unique unigrams are there in the transcripts from each specialty?

#  number of unique uni-grams per specialty
#  (stop words and numbers were already removed in the pre-processing step)

tokenized.data.unigram %>% dplyr::group_by(medical_specialty) %>% dplyr::summarise(n_unigrams = n_distinct(word))
## # A tibble: 3 × 2
##   medical_specialty n_unigrams
##   <chr>                  <int>
## 1 Orthopedic              7681
## 2 Radiology               5933
## 3 Surgery                11977
#  distribution of uni-gram token frequencies:
#  how many distinct words occur exactly `count` times in the corpus?

word_counts <- tokenized.data.unigram %>%
  dplyr::count(word, name = "count", sort = TRUE)

count_distribution <- word_counts %>%
  dplyr::count(count, name = "num_words")

ggplot2::ggplot(count_distribution, aes(x = count, y = num_words)) +
  geom_point() +
  labs(title = "Scatter Plot of Count Distribution",
       x = "Count of Unique Words",
       y = "Number of Words")

#  distribution of bi-gram token frequencies:
#  how many distinct bigrams occur exactly `count` times in the corpus?

word_counts <- tokenized.data.bigram %>%
    group_by(ngram) %>%
    summarise(count = n()) %>%
    ungroup() %>%
    arrange(desc(count))

count_distribution <- word_counts %>%
  group_by(count) %>%
  summarise(num_words = n()) %>%
  ungroup()

# y-axis label corrected: each point counts BIGRAMS (not words) that share
# a given occurrence frequency
ggplot2::ggplot(count_distribution, aes(x = count, y = num_words)) +
  geom_point() +
  labs(title = "Scatter Plot of Count Distribution",
       x = "Count of Unique Bigrams",
       y = "Number of Bigrams")

Q3) How many unique bi-grams are there in each category without stop words and numbers?

#  number of unique bi-grams per specialty
#  (stop words and numbers removed in the previous pre-processing step;
#  note: bigrams were formed from the cleaned text, i.e. AFTER stop-word
#  removal, so these token pairs were not necessarily adjacent originally)

tokenized.data.bigram %>% dplyr::group_by(medical_specialty) %>% dplyr::summarise(n_bigrams = n_distinct(ngram))
## # A tibble: 3 × 2
##   medical_specialty n_bigrams
##   <chr>                 <int>
## 1 Orthopedic            55730
## 2 Radiology             28294
## 3 Surgery              130404

Q4) How many unique sentences are there in each category?

#  split each original transcription into sentences
#  (variable name typo fixed: "senstence" -> "sentence")
tokenized.data.sentence <- filtered.data %>% 
  unnest_tokens(output = sentence, transcription, token = "sentences") %>%
  # NOTE(review): `\\d*` also matches the empty string, so this removes EVERY
  # "." (with any preceding digits), not only numbered items — confirm intended
  mutate(sentence = str_replace_all(sentence, "\\d*\\.", ""))

#  number of unique sentences per specialty
tokenized.data.sentence %>% dplyr::group_by(medical_specialty) %>% dplyr::summarise(n_sentences = n_distinct(sentence))
## # A tibble: 3 × 2
##   medical_specialty n_sentences
##   <chr>                   <int>
## 1 Orthopedic              11162
## 2 Radiology                4550
## 3 Surgery                 29595
# use the bi-gram tokenisation for the following frequency analysis
tokenized.data <- tokenized.data.bigram

#  top 5 most frequent bi-grams per specialty
#  (slice_max keeps ties, hence 16 rows; replaces the deprecated top_n,
#  consistent with the slice_max usage elsewhere in this document)
tokenized.data %>%
  dplyr::group_by(medical_specialty) %>%
  dplyr::count(ngram, sort = TRUE) %>%
  dplyr::slice_max(order_by = n, n = 5)
## # A tibble: 16 × 3
## # Groups:   medical_specialty [3]
##    medical_specialty ngram                       n
##    <chr>             <chr>                   <int>
##  1 Surgery           prepped draped            696
##  2 Surgery           preoperative diagnosis    555
##  3 Surgery           procedure patient         551
##  4 Surgery           postoperative diagnosis   518
##  5 Surgery           tolerated procedure       515
##  6 Orthopedic        prepped draped            183
##  7 Orthopedic        preoperative diagnosis    141
##  8 Orthopedic        lower extremity           139
##  9 Orthopedic        range motion              139
## 10 Orthopedic        postoperative diagnosis   124
## 11 Radiology         carotid artery             59
## 12 Radiology         heart rate                 52
## 13 Radiology         reason exam                51
## 14 Radiology         left ventricular           50
## 15 Radiology         coronary artery            43
## 16 Radiology         exam unremarkable          43

Q5) Do you think a general purpose lemmatizer will work well for medical data? Why might it not?

A general-use lemmatizer is unlikely to work well on medical texts, because of the following two reasons:

  1. A general-use lemmatizer is trained on everyday, commonly used English words, so it may incorrectly normalize medical domain-specific technical terms or fail to detect their roots, as these terms are rarely seen in lay texts.

  2. The affixation, parts of speech, or any variant form of the base word may convey different semantic meanings that are common and important for medical contexts, one example is “pre-” and “post-”.

# apply lemmatizer to each bi-gram
# NOTE(review): textstem::lemmatize_words expects single words; applied to
# two-word strings it will likely leave most bigrams unchanged — confirm intended.

lemmatized.data <- tokenized.data %>% dplyr::mutate(lemma=textstem::lemmatize_words(ngram))
#  calculate the per-specialty relative frequency (proportion) of each lemma
##   comparison of "Orthopedic" against the other two specialties

lemma.freq <- lemmatized.data %>% 
  dplyr::count(medical_specialty, lemma) %>%
  dplyr::group_by(medical_specialty) %>% 
  dplyr::mutate(proportion = n / sum(n)) %>%
  tidyr::pivot_wider(names_from = medical_specialty, values_from = proportion) %>%
  tidyr::pivot_longer(`Surgery`:`Radiology`,
               names_to = "medical_specialty", values_to = "proportion")
#  plot the relative proportions on log10 scales; points near the dashed
#  diagonal are lemmas used with similar frequency in both specialties
#  (the warnings below come from lemmas absent in one specialty, i.e. NAs)

ggplot2::ggplot(lemma.freq, ggplot2::aes(x=proportion, 
                                         y=`Orthopedic`,
                                         color=abs(`Orthopedic` - proportion))) + 
  ggplot2::geom_abline(color="gray40", lty=2) +
  ggplot2::geom_jitter(alpha=0.1, size=2.5, width=0.3, height=0.3) +
  ggplot2::geom_text(ggplot2::aes(label=lemma), check_overlap=TRUE, vjust=1.5) +
  ggplot2::scale_x_log10(labels=scales::percent_format()) + 
  ggplot2::scale_y_log10(labels=scales::percent_format()) + 
  ggplot2::scale_color_gradient(limits=c(0, 0.001), low="darkslategray4", high="gray75") +
  ggplot2::facet_wrap(~medical_specialty, ncol = 2) +
  ggplot2::theme(legend.position="none") +
  ggplot2:: labs(y="Orthopedic", x = NULL)
## Warning: Removed 314400 rows containing missing values (`geom_point()`).
## Warning: Removed 314400 rows containing missing values (`geom_text()`).

Q6) What does this plot tell you about the relative similarity of lemma frequencies between Surgery and Orthopedic and between Radiology and Orthopedic? Based on what these specialties involve, is this what you would expect?

These two plots compare the base-10 log-transformed relative proportion (%) of frequencies for each lemma existing in Orthopedic notes against Surgery and Radiology notes respectively. The more the number of terms distributed along the dashed diagonal line suggests a higher similarity of the lemmas used in the notes for both specialties.

Based on the comparison between Surgery and Orthopedic, the majority of shared lemmas are distributed above the diagonal line. This means surgery-required lemmas in orthopedics take up only a small part of all surgical operations, and most of these are related to fracture and ligament, which is intuitive and easily interpretable.

For the comparison between Radiology and Orthopedic, the majority of shared lemmas are distributed under the diagonal line. This means radiological lemmas are less common than surgical lemmas in orthopedic clinics, and the most common situations involve imaging. This is somewhat unexpected, because imaging is a common tool for inspecting problems in musculoskeletal structures and for monitoring the recovery process.

Q7) Modify the above plotting code to do a direct comparison of Surgery and Radiology (i.e., have Surgery or Radiology on the Y-axis and the other 2 specialties as the X facets)

#  calculate the per-specialty relative frequency (proportion) of each lemma
##   comparison of "Surgery" against the other two specialties (Q7)

lemma.freq.2 <- lemmatized.data %>% 
  dplyr::count(medical_specialty, lemma) %>%
  dplyr::group_by(medical_specialty) %>% 
  dplyr::mutate(proportion = n / sum(n)) %>%
  tidyr::pivot_wider(names_from = medical_specialty, values_from = proportion) %>%
  tidyr::pivot_longer(`Orthopedic`:`Radiology`,
                      names_to = "medical_specialty", values_to = "proportion")

# same plot as before but with Surgery on the y-axis and the other two
# specialties as facets on the x-axis
ggplot2::ggplot(lemma.freq.2, ggplot2::aes(x=proportion, 
                                           y=`Surgery`,
                                           color=abs(`Surgery` - proportion))) + 
  ggplot2::geom_abline(color="gray40", lty=2) +
  ggplot2::geom_jitter(alpha=0.1, size=2.5, width=0.3, height=0.3) +
  ggplot2::geom_text(ggplot2::aes(label=lemma), check_overlap=TRUE, vjust=1.5) +
  ggplot2::scale_x_log10(labels=scales::percent_format()) + 
  ggplot2::scale_y_log10(labels=scales::percent_format()) + 
  ggplot2::scale_color_gradient(limits=c(0, 0.001), low="darkslategray4", high="gray75") +
  ggplot2::facet_wrap(~medical_specialty, ncol = 2) +
  ggplot2::theme(legend.position="none") +
  ggplot2:: labs(y="Surgery", x = NULL)
## Warning: Removed 317813 rows containing missing values (`geom_point()`).
## Warning: Removed 317814 rows containing missing values (`geom_text()`).

#  calculate the per-specialty relative frequency (proportion) of each lemma
##   comparison of "Radiology" against the other two specialties (Q7)

lemma.freq.3 <- lemmatized.data %>% 
  dplyr::count(medical_specialty, lemma) %>%
  dplyr::group_by(medical_specialty) %>% 
  dplyr::mutate(proportion = n / sum(n)) %>%
  tidyr::pivot_wider(names_from = medical_specialty, values_from = proportion) %>%
  tidyr::pivot_longer(c(`Orthopedic`,`Surgery`),
                      names_to = "medical_specialty", values_to = "proportion")

# same plot as before but with Radiology on the y-axis and the other two
# specialties as facets on the x-axis
ggplot2::ggplot(lemma.freq.3, ggplot2::aes(x=proportion, 
                                           y=`Radiology`,
                                           color=abs(`Radiology` - proportion))) + 
  ggplot2::geom_abline(color="gray40", lty=2) +
  ggplot2::geom_jitter(alpha=0.1, size=2.5, width=0.3, height=0.3) +
  ggplot2::geom_text(ggplot2::aes(label=lemma), check_overlap=TRUE, vjust=1.5) +
  ggplot2::scale_x_log10(labels=scales::percent_format()) + 
  ggplot2::scale_y_log10(labels=scales::percent_format()) + 
  ggplot2::scale_color_gradient(limits=c(0, 0.001), low="darkslategray4", high="gray75") +
  ggplot2::facet_wrap(~medical_specialty, ncol = 2) +
  ggplot2::theme(legend.position="none") +
  ggplot2:: labs(y="Radiology", x = NULL)
## Warning: Removed 343441 rows containing missing values (`geom_point()`).
## Warning: Removed 343442 rows containing missing values (`geom_text()`).

TF-IDF Normalisation

##  obtain per-specialty lemma counts and specialty totals

lemma.counts <- lemmatized.data %>% dplyr::count(medical_specialty, lemma)
total.counts <- lemma.counts %>% 
                      dplyr::group_by(medical_specialty) %>% 
                      dplyr::summarise(total=sum(n))

all.counts <- dplyr::left_join(lemma.counts, total.counts)
## Joining with `by = join_by(medical_specialty)`
##  calculate the term frequency / inverse document frequency (tf-idf),
##  treating each specialty as one "document"

all.counts.tfidf <- tidytext::bind_tf_idf(all.counts, lemma, medical_specialty, n) 
##  print top 10 lemmas by tf-idf within each specialty

all.counts.tfidf %>% dplyr::group_by(medical_specialty) %>% dplyr::slice_max(order_by=tf_idf, n=10)
## # A tibble: 30 × 7
## # Groups:   medical_specialty [3]
##    medical_specialty lemma                     n total       tf   idf   tf_idf
##    <chr>             <chr>                 <int> <int>    <dbl> <dbl>    <dbl>
##  1 Orthopedic        range motion            139 97774 0.00142  0.405 0.000576
##  2 Orthopedic        carpal ligament          85 97774 0.000869 0.405 0.000352
##  3 Orthopedic        transverse carpal        81 97774 0.000828 0.405 0.000336
##  4 Orthopedic        extremity prepped        79 97774 0.000808 0.405 0.000328
##  5 Orthopedic        proximal phalanx         75 97774 0.000767 0.405 0.000311
##  6 Orthopedic        department anesthesia    63 97774 0.000644 0.405 0.000261
##  7 Orthopedic        dissection carried       63 97774 0.000644 0.405 0.000261
##  8 Orthopedic        steri strips             59 97774 0.000603 0.405 0.000245
##  9 Orthopedic        closed vicryl            58 97774 0.000593 0.405 0.000241
## 10 Orthopedic        dressing applied         58 97774 0.000593 0.405 0.000241
## # ℹ 20 more rows

Q8) Are there any lemmas that stand out in these lists? Why or why not?

Generally, there is no common lemma that particularly has a significantly higher TF-IDF score than other lemmas in all 3 specialties, because the list of top lemmas in Radiology is quite different from the remaining two specialties. For Surgery and Orthopedics, “steri strips” and “closed vicryl” appear among the top 10 lemmas for both categories, as well as some bi-grams containing the word “anesthesia”.

These words are common tools or techniques that are adopted when performing surgery, including orthopedic-related surgical operations. From below, each of the two terms appears in around 200 transcriptions, and the two terms are mentioned together in 60 notes.

##  Extract the full transcriptions from notes containing both terms "steri strips" and "closed vicryl"

a <- analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'steri strips'))
b <- analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'closed vicryl')) 

# number of notes containing each term (nrow replaces dim(x)[1])
print(nrow(a))
## [1] 207
print(nrow(b))
## [1] 201
# number of notes containing BOTH terms
print(nrow(subset(a, transcription %in% b$transcription)))
## [1] 60
##  Extract 1 example of the full transcriptions from notes containing the term "steri strips"

analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'steri strips'))  %>% dplyr::slice(1)
## # A tibble: 1 × 2
##   medical_specialty transcription                                               
##   <chr>             <chr>                                                       
## 1 Surgery           preoperative diagnoses hallux rigidus left foot elevated me…

Q9) Extract an example of one of the other “top lemmas” by modifying the above code.

##  Show one full transcription that mentions the term "closed vicryl"

analysis.data %>%
  dplyr::filter(stringr::str_detect(transcription, 'closed vicryl')) %>%
  dplyr::select(medical_specialty, transcription) %>%
  dplyr::slice(1)
## # A tibble: 1 × 2
##   medical_specialty transcription                                               
##   <chr>             <chr>                                                       
## 1 Surgery           title operation placement ventriculoperitoneal vp shunts st…

Topic Modelling

First lets calculate a term frequency matrix for each transcription:

# per-note lemma counts and per-note totals (input for the document-term matrix)
lemma.counts <- lemmatized.data %>% dplyr::count(note_id, lemma)
total.counts <- lemma.counts %>% 
                      dplyr::group_by(note_id) %>% 
                      dplyr::summarise(total=sum(n))

# join key made explicit instead of relying on the implicit natural join
all.counts <- dplyr::left_join(lemma.counts, total.counts, by = "note_id")
# cast into a document-term matrix: one row per note, one column per lemma
emr.dcm <- all.counts %>% tidytext::cast_dtm(note_id, lemma, n)

Then we can use LDA function to fit a 5 topic (k=5) LDA-model:

# fit a 5-topic LDA model (fixed seed for reproducibility), then tidy the
# per-topic word-probability (beta) matrix into long format
emr.lda <- topicmodels::LDA(emr.dcm, k=5, control=list(seed=42))
emr.topics <- tidytext::tidy(emr.lda, matrix='beta')

Then we can extract the top terms per assigned topic:

# top 10 terms (highest beta) per topic
top.terms <- emr.topics %>% dplyr::group_by(topic) %>% 
  dplyr::slice_max(beta, n=10) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(topic, -beta)

# plot per-topic term probabilities
# (element_text is namespace-qualified for consistency with the other calls)
top.terms %>% 
  dplyr::mutate(term=tidytext::reorder_within(term, beta, topic)) %>% 
  ggplot2::ggplot(ggplot2::aes(beta, term, fill=factor(topic))) + 
    ggplot2::geom_col(show.legend=FALSE) + 
    ggplot2::facet_wrap(~ topic, scales='free')  +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust = 1)) +
    tidytext::scale_y_reordered()

# per-document topic proportions (gamma matrix) from the fitted LDA model
specialty_gamma <- tidytext::tidy(emr.lda, matrix='gamma')

# we need to join in the specialty from the note_id
# (tidy() returns note ids as character "document", hence the conversion)
note_id_specialty_mapping <- lemmatized.data %>%
  dplyr::mutate(document=as.character(note_id)) %>% 
  dplyr::select(document, medical_specialty) %>% 
  dplyr::distinct()

specialty_gamma <- dplyr::left_join(specialty_gamma, note_id_specialty_mapping)
## Joining with `by = join_by(document)`
# boxplots of gamma per topic, faceted by specialty
specialty_gamma %>%
  dplyr::mutate(medical_specialty = reorder(medical_specialty, gamma * topic)) %>%
  ggplot2::ggplot(ggplot2::aes(factor(topic), gamma)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~ medical_specialty) +
  ggplot2::labs(x = "topic", y = expression(gamma))

Q10) Repeat this with a 6 topic LDA, do the top terms from the 5 topic LDA still turn up? How do the specialties get split into sub-topics?

Comparing the 6-topic LDA model and the 5-topic LDA model:

  1. The 1st and 3rd topics of both LDA models are basically the same, just the rankings of the top 10 terms have some differences;

  2. The 2nd and 4th topics of both LDA models should be relevant, but some new terms enter the top 10 for the 6-topic LDA model, for example, “proximal phalanx”, “ankle tourniquet”, “metatarsal head” for the 2nd topic, as well as “central canal”, “lower extremity” for the 4th topic.

    This shows the 6-topic LDA model can capture more specific themes or entities for these two topics, while the 5-topic LDA model repeats some high-frequency terms that also appear in other topics, like “tolerated procedure”, “patient tolerated”.

  3. The 5th topic in both models are similar, just the 6-topic LDA model identifies “vicryl suture” that is missing in the 5-topic LDA model.

  4. The new 6th topic is left body related, with terms like “left main” and “left anterior” regrouped to this topic.

There is a great change in the topic distribution of each specialty. In the 5-topic LDA model, each specialty has a dominant topic (Orthopedics: topic 2, Surgery: topic 3, Radiology: topic 4). However, in the 6-topic LDA model, Orthopedics and Surgery do not show a dependence on a particular topic. While topic 4 is still the major topic for Radiology, its sub-topic changes from topic 1 to the newly formed topic 6.

This observation may be due to the algorithm of LDA, because LDA is a probabilistic model that each note is assumed to be containing a mixture of topics (document-topic distribution) and a topic is made of a mixture of terms (topic-term distribution). The probabilistic distribution is guided by the inputted TF-IDF frequency matrix. Some frequent terms appearing in a majority of notes of the three specialties will have a higher probability of being drawn, which explains why they are top-ranked in each topic. When all topics have some extent of similarities, the document overall will tend to have a relatively equal distribution of topics.

Adding 1 topic may have increased the complexity and perplexity of the model. Although some topics (e.g., 2nd and 4th) become more specific, the topic probabilities for each note may become less variant. Upon aggregation by the three specialties, the topic contribution becomes more even.

D. Buenaño-Fernandez, M. González, D. Gil and S. Luján-Mora, (2020). “Text Mining of Open-Ended Questions in Self-Assessment of University Teachers: An LDA Topic Modeling Approach,” in IEEE Access, vol. 8, pp. 35318-35330, doi: 10.1109/ACCESS.2020.2974983.

Redo LDA function to fit a 6 topic LDA-model:

# fit a 6-topic LDA model on the same document-term matrix (same seed) and
# tidy the per-topic word-probability (beta) matrix
emr.lda.2 <- topicmodels::LDA(emr.dcm, k=6, control=list(seed=42))
emr.topics.2 <- tidytext::tidy(emr.lda.2, matrix='beta')

# top 10 terms per topic, as before
top.terms.2 <- emr.topics.2 %>% dplyr::group_by(topic) %>% 
  dplyr::slice_max(beta, n=10) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(topic, -beta)


top.terms.2 %>% 
  dplyr::mutate(term=tidytext::reorder_within(term, beta, topic)) %>% 
  ggplot2::ggplot(ggplot2::aes(beta, term, fill=factor(topic))) + 
    ggplot2::geom_col(show.legend=FALSE) + 
    ggplot2::facet_wrap(~ topic, scales='free')  +
    ggplot2::theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1)) +
    tidytext::scale_y_reordered()

# per-document topic proportions (gamma) for the 6-topic model
specialty_gamma.2 <- tidytext::tidy(emr.lda.2, matrix='gamma')

# we need to join in the specialty from the note_id
note_id_specialty_mapping.2 <- lemmatized.data %>%
  dplyr::mutate(document=as.character(note_id)) %>% 
  dplyr::select(document, medical_specialty) %>% 
  dplyr::distinct()

specialty_gamma.2 <- dplyr::left_join(specialty_gamma.2, note_id_specialty_mapping.2)
## Joining with `by = join_by(document)`
# boxplots of gamma per topic, faceted by specialty
specialty_gamma.2 %>%
  dplyr::mutate(medical_specialty = reorder(medical_specialty, gamma * topic)) %>%
  ggplot2::ggplot(ggplot2::aes(factor(topic), gamma)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~ medical_specialty) +
  ggplot2::labs(x = "topic", y = expression(gamma))

# NOTE(review): unresolved git merge conflict — everything below this line duplicates the beginning of this document; resolve the conflict by keeping a single copy.
# One-off package installation (kept commented out so knitting does not reinstall).
# NOTE(review): fixed typo "tidtext" -> "tidytext"; "scales" and "stringr" are loaded
# below but were missing from this list, so they are added here.
#install.packages(c("readr", "dplyr", "tidyr", "ggplot2", "tidytext", "textstem", "clinspacy", "topicmodels", "reshape2", "scales", "stringr"), dependencies = TRUE)

# Echo all code chunks in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
library(readr)
## Warning: package 'readr' was built under R version 4.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(scales)
## Warning: package 'scales' was built under R version 4.3.2
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(textstem)
## Warning: package 'textstem' was built under R version 4.3.3
## Loading required package: koRpus.lang.en
## Warning: package 'koRpus.lang.en' was built under R version 4.3.3
## Loading required package: koRpus
## Warning: package 'koRpus' was built under R version 4.3.3
## Loading required package: sylly
## Warning: package 'sylly' was built under R version 4.3.3
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
## 
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
## 
##     tokenize
library(clinspacy)
## Warning: package 'clinspacy' was built under R version 4.3.3
## Welcome to clinspacy.
## By default, this package will install and use miniconda and create a "clinspacy" conda environment.
## If you want to override this behavior, use clinspacy_init(miniconda = FALSE) and specify an alternative environment using reticulate::use_python() or reticulate::use_conda().
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.3.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.2

Data Parsing

# load dataset from clinSpaCy
raw.data <- clinspacy::dataset_mtsamples()

# a fast glimpse on the columns contained in the dataset
dplyr::glimpse(raw.data)
## Rows: 4,999
## Columns: 6
## $ note_id           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ description       <chr> "A 23-year-old white female presents with complaint …
## $ medical_specialty <chr> "Allergy / Immunology", "Bariatrics", "Bariatrics", …
## $ sample_name       <chr> "Allergic Rhinitis", "Laparoscopic Gastric Bypass Co…
## $ transcription     <chr> "SUBJECTIVE:,  This 23-year-old white female present…
## $ keywords          <chr> "allergy / immunology, allergic rhinitis, allergies,…
# emit a blank separator between printed outputs
cat(c("",""), sep="\n\n")
# variable type for each column
sapply(raw.data, class)
##           note_id       description medical_specialty       sample_name 
##         "integer"       "character"       "character"       "character" 
##     transcription          keywords 
##       "character"       "character"
cat(c("",""), sep="\n\n")
# check number of levels in "medical_specialty"
cat(c("Number of medical specialties = ", raw.data %>% dplyr::select(medical_specialty) %>% dplyr::n_distinct()), sep=" ")
## Number of medical specialties =  40

Q1) Using the output of dplyr’s glimpse command (or rstudio’s data viewer by clicking on raw.data in the Environment pane) provide a description of what you think each variable in this dataset contains.

Based on the manual inspection and the printed outputs above, we can summarise the variables as in the following table.


Feature           | Provided Information                                                        | Type                 |
------------------| ----------------------------------------------------------------------------| ---------------------|
note_id:          | Unique identifier for each medical note                                     | Integer (identifier) |
description:      | Text description about the observed symptoms of the patient                 | Text/String          |
medical_specialty:| Classified main category or branch of medicine for the diagnosed symptoms   | Factor (nominal)     |
sample_name:      | Sub-category stating the specific details of the diagnosed symptoms         | Factor (nominal)     |
transcription:    | Full text of the medical notes showing all observations and medical details | Text/String          |
keywords:         | Extracted important clinical words or entities related to this visit        | Text/String          |
#  number of transcripts are there from each specialty
ggplot2::ggplot(raw.data, ggplot2::aes(y=medical_specialty)) + ggplot2::geom_bar() + labs(x="Document Count", y="Medical Speciality")

#  filter specific specialties for analysis
# Keep only the three specialties compared throughout the rest of the practical.
filtered.data <- raw.data %>% dplyr::filter(medical_specialty %in% c("Orthopedic", "Radiology", "Surgery")) 

Text Processing

# Clean each transcription: tokenize, strip punctuation/digits and stop words,
# then rebuild a cleaned 'transcription' string per note.
analysis.data <- filtered.data %>%
  unnest_tokens(word, transcription) %>%
  
  ## strip non-alphanumeric characters, then drop any token still containing a digit
  mutate(word = str_replace_all(word, "[^[:alnum:]]", "")) %>%
  filter(!str_detect(word, "[0-9]")) %>%
  
  ## remove stop words
  ##  based on `tidytext::stop_words` and `dplyr::anti_join()`
  anti_join(stop_words) %>%
  
  ## per each note, combine the token as a string, replace the original 'transcription' column
  group_by(note_id) %>%
  summarise(transcription = paste(word, collapse = " ")) %>%
  left_join(select(filtered.data, -transcription), by = "note_id")
## Joining with `by = join_by(word)`
#  uni-gram
tokenized.data.unigram <- analysis.data %>% tidytext::unnest_tokens(word, transcription, to_lower=TRUE)
#  bi-gram
tokenized.data.bigram <- analysis.data %>% tidytext::unnest_tokens(ngram, transcription, token = "ngrams", n=2, to_lower = TRUE)
#  number of stop words in `tidytext::stop_words`
tidytext::stop_words %>% dplyr::group_by(lexicon) %>% dplyr::distinct(word) %>% dplyr::summarise(n=dplyr::n())
## # A tibble: 3 × 2
##   lexicon      n
##   <chr>    <int>
## 1 SMART      570
## 2 onix       398
## 3 snowball   174

Q2) How many unique unigrams are there in the transcripts from each specialty?

#  number of unique uni-grams in the dataset (stop words and numbers removed in the previous pre-processing step)

# Q2 answer: distinct unigram count per specialty.
tokenized.data.unigram %>% dplyr::group_by(medical_specialty) %>% dplyr::summarise(n_unigrams = n_distinct(word))
## # A tibble: 3 × 2
##   medical_specialty n_unigrams
##   <chr>                  <int>
## 1 Orthopedic              7681
## 2 Radiology               5933
## 3 Surgery                11977
#  plot some distribution of uni-gram tokens

# Per-token frequency, most frequent tokens first.
word_counts <- tokenized.data.unigram %>%
  dplyr::count(word, name = "count", sort = TRUE)

# For each frequency value, the number of distinct tokens occurring that often.
count_distribution <- word_counts %>%
  dplyr::count(count, name = "num_words")

# Scatter plot of the (heavily right-skewed) token frequency distribution.
ggplot2::ggplot(count_distribution, aes(x = count, y = num_words)) +
  geom_point() +
  labs(title = "Scatter Plot of Count Distribution",
       x = "Count of Unique Words",
       y = "Number of Words")

#  plot some distribution of bi-gram tokens

# Per-bigram frequency, most frequent bigrams first.
word_counts <- tokenized.data.bigram %>%
  dplyr::count(ngram, name = "count", sort = TRUE)

# For each frequency value, the number of distinct bigrams occurring that often.
count_distribution <- word_counts %>%
  dplyr::count(count, name = "num_words")

# Scatter plot of the bigram frequency distribution.
ggplot2::ggplot(count_distribution, aes(x = count, y = num_words)) +
  geom_point() +
  labs(title = "Scatter Plot of Count Distribution",
       x = "Count of Unique Bigrams",
       y = "Number of Words")

Q3) How many unique bi-grams are there in each category without stop words and numbers?

#  number of unique bi-grams in the dataset 
#  (stop words and numbers removed in the previous pre-processing step)

# Q3 answer: distinct bigram count per specialty.
tokenized.data.bigram %>% dplyr::group_by(medical_specialty) %>% dplyr::summarise(n_bigrams = n_distinct(ngram))
## # A tibble: 3 × 2
##   medical_specialty n_bigrams
##   <chr>                 <int>
## 1 Orthopedic            55730
## 2 Radiology             28294
## 3 Surgery              130404

Q4) How many unique sentences are there in each category?

#  get a sentence splitting result
# NOTE(review): "senstence" is a typo for "sentence" in the variable name (used consistently below).
# The regex strips enumeration artefacts: any run of digits followed by a period.
tokenized.data.senstence = filtered.data %>% 
  unnest_tokens(output = sentence, transcription, token = "sentences") %>%
  mutate(sentence = str_replace_all(sentence, "\\d*\\.", ""))

#  number of unique sentences per each specialty
# Q4 answer: distinct sentence count per specialty.
tokenized.data.senstence %>% dplyr::group_by(medical_specialty) %>% dplyr::summarise(n_sentences = n_distinct(sentence))
## # A tibble: 3 × 2
##   medical_specialty n_sentences
##   <chr>                   <int>
## 1 Orthopedic              11162
## 2 Radiology                4550
## 3 Surgery                 29595
# Use the bigram tokens for the remainder of the analysis.
tokenized.data = tokenized.data.bigram

#  get the top 5 frequent bi-grams per specialty
# NOTE(review): dplyr::top_n() is superseded by slice_max(n, n = 5); kept as-is to
# match the recorded output (ties explain the 16 rows instead of 15).
tokenized.data %>%
  dplyr::group_by(medical_specialty) %>%
  dplyr::count(ngram, sort = TRUE) %>%
  dplyr::top_n(5)
## Selecting by n
## # A tibble: 16 × 3
## # Groups:   medical_specialty [3]
##    medical_specialty ngram                       n
##    <chr>             <chr>                   <int>
##  1 Surgery           prepped draped            696
##  2 Surgery           preoperative diagnosis    555
##  3 Surgery           procedure patient         551
##  4 Surgery           postoperative diagnosis   518
##  5 Surgery           tolerated procedure       515
##  6 Orthopedic        prepped draped            183
##  7 Orthopedic        preoperative diagnosis    141
##  8 Orthopedic        lower extremity           139
##  9 Orthopedic        range motion              139
## 10 Orthopedic        postoperative diagnosis   124
## 11 Radiology         carotid artery             59
## 12 Radiology         heart rate                 52
## 13 Radiology         reason exam                51
## 14 Radiology         left ventricular           50
## 15 Radiology         coronary artery            43
## 16 Radiology         exam unremarkable          43

Q5) Do you think a general purpose lemmatizer will work well for medical data? Why might it not?

A general-use lemmatizer is unlikely to work well on medical texts, because of the following two reasons:

  1. A general-purpose lemmatizer is trained on everyday English words, and may incorrectly normalize medical domain-specific technical terms or fail to detect their roots, as these terms rarely appear in lay texts.

  2. The affixation, parts of speech, or any variant form of the base word may convey different semantic meanings that are common and important for medical contexts, one example is “pre-” and “post-”.

# apply lemmatizer
# NOTE(review): textstem::lemmatize_words expects single words; applied to two-word
# bigram strings it likely leaves most entries unchanged — lemmatize_strings() may be
# the intended call. TODO confirm against the downstream results.
lemmatized.data <- tokenized.data %>% dplyr::mutate(lemma=textstem::lemmatize_words(ngram))
#  calculate the frequency of lemmas within each specialty and note
##   comparison on "Orthopedic" class

# Wide-then-long reshape so each row pairs the Orthopedic proportion (y) with one of
# the other two specialties' proportions (x) for the same lemma.
lemma.freq <- lemmatized.data %>% 
  dplyr::count(medical_specialty, lemma) %>%
  dplyr::group_by(medical_specialty) %>% 
  dplyr::mutate(proportion = n / sum(n)) %>%
  tidyr::pivot_wider(names_from = medical_specialty, values_from = proportion) %>%
  tidyr::pivot_longer(`Surgery`:`Radiology`,
               names_to = "medical_specialty", values_to = "proportion")
#  plot the relative proportions

# Log-log scatter of lemma proportions; points near the dashed diagonal are lemmas
# used with similar relative frequency in both specialties.
ggplot2::ggplot(lemma.freq, ggplot2::aes(x=proportion, 
                                         y=`Orthopedic`,
                                         color=abs(`Orthopedic` - proportion))) + 
  ggplot2::geom_abline(color="gray40", lty=2) +
  ggplot2::geom_jitter(alpha=0.1, size=2.5, width=0.3, height=0.3) +
  ggplot2::geom_text(ggplot2::aes(label=lemma), check_overlap=TRUE, vjust=1.5) +
  ggplot2::scale_x_log10(labels=scales::percent_format()) + 
  ggplot2::scale_y_log10(labels=scales::percent_format()) + 
  ggplot2::scale_color_gradient(limits=c(0, 0.001), low="darkslategray4", high="gray75") +
  ggplot2::facet_wrap(~medical_specialty, ncol = 2) +
  ggplot2::theme(legend.position="none") +
  ggplot2:: labs(y="Orthopedic", x = NULL)
## Warning: Removed 314400 rows containing missing values (`geom_point()`).
## Warning: Removed 314400 rows containing missing values (`geom_text()`).

Q6) What does this plot tell you about the relative similarity of lemma frequencies between Surgery and Orthopedic and between Radiology and Orthopedic? Based on what these specialties involve, is this what you would expect?

These two plots compare the base-10 log-transformed relative proportion (%) of frequencies for each lemma existing in Orthopedic notes against Surgery and Radiology notes respectively. The more the number of terms distributed along the dashed diagonal line suggests a higher similarity of the lemmas used in the notes for both specialties.

Based on the comparison between Surgery and Orthopedic, the majority of shared lemmas are distributed above the diagonal line. This means surgery-required lemmas in orthopedics take up only a small part of all surgical operations, and most of these are related to fracture and ligament, which is intuitive and easily interpretable.

For the comparison between Radiology and Orthopedic, the majority of shared lemmas are distributed under the diagonal line. This means radiological lemmas are less common than surgical lemmas in orthopedic clinics, and the most common shared situations involve imaging. This is somewhat unexpected, because imaging is a common tool for inspecting problems in the musculoskeletal structure and for monitoring the recovery process.

Q7) Modify the above plotting code to do a direct comparison of Surgery and Radiology (i.e., have Surgery or Radiology on the Y-axis and the other 2 specialties as the X facets)

#  calculate the frequency of lemmas within each specialty and note
##   comparison on "Surgery" class

# Q7: same reshape as before, but keep Surgery wide (y-axis) and compare against
# Orthopedic and Radiology as x facets.
lemma.freq.2 <- lemmatized.data %>% 
  dplyr::count(medical_specialty, lemma) %>%
  dplyr::group_by(medical_specialty) %>% 
  dplyr::mutate(proportion = n / sum(n)) %>%
  tidyr::pivot_wider(names_from = medical_specialty, values_from = proportion) %>%
  tidyr::pivot_longer(`Orthopedic`:`Radiology`,
                      names_to = "medical_specialty", values_to = "proportion")

ggplot2::ggplot(lemma.freq.2, ggplot2::aes(x=proportion, 
                                           y=`Surgery`,
                                           color=abs(`Surgery` - proportion))) + 
  ggplot2::geom_abline(color="gray40", lty=2) +
  ggplot2::geom_jitter(alpha=0.1, size=2.5, width=0.3, height=0.3) +
  ggplot2::geom_text(ggplot2::aes(label=lemma), check_overlap=TRUE, vjust=1.5) +
  ggplot2::scale_x_log10(labels=scales::percent_format()) + 
  ggplot2::scale_y_log10(labels=scales::percent_format()) + 
  ggplot2::scale_color_gradient(limits=c(0, 0.001), low="darkslategray4", high="gray75") +
  ggplot2::facet_wrap(~medical_specialty, ncol = 2) +
  ggplot2::theme(legend.position="none") +
  ggplot2:: labs(y="Surgery", x = NULL)
## Warning: Removed 317813 rows containing missing values (`geom_point()`).
## Warning: Removed 317814 rows containing missing values (`geom_text()`).

#  calculate the frequency of lemmas within each specialty and note
##   comparison on "Radiology" class

# Same comparison with Radiology on the y-axis versus the other two specialties.
lemma.freq.3 <- lemmatized.data %>% 
  dplyr::count(medical_specialty, lemma) %>%
  dplyr::group_by(medical_specialty) %>% 
  dplyr::mutate(proportion = n / sum(n)) %>%
  tidyr::pivot_wider(names_from = medical_specialty, values_from = proportion) %>%
  tidyr::pivot_longer(c(`Orthopedic`,`Surgery`),
                      names_to = "medical_specialty", values_to = "proportion")

ggplot2::ggplot(lemma.freq.3, ggplot2::aes(x=proportion, 
                                           y=`Radiology`,
                                           color=abs(`Radiology` - proportion))) + 
  ggplot2::geom_abline(color="gray40", lty=2) +
  ggplot2::geom_jitter(alpha=0.1, size=2.5, width=0.3, height=0.3) +
  ggplot2::geom_text(ggplot2::aes(label=lemma), check_overlap=TRUE, vjust=1.5) +
  ggplot2::scale_x_log10(labels=scales::percent_format()) + 
  ggplot2::scale_y_log10(labels=scales::percent_format()) + 
  ggplot2::scale_color_gradient(limits=c(0, 0.001), low="darkslategray4", high="gray75") +
  ggplot2::facet_wrap(~medical_specialty, ncol = 2) +
  ggplot2::theme(legend.position="none") +
  ggplot2:: labs(y="Radiology", x = NULL)
## Warning: Removed 343441 rows containing missing values (`geom_point()`).
## Warning: Removed 343442 rows containing missing values (`geom_text()`).

TF-IDF Normalisation

##  obtain counts of terms

# Per-specialty lemma counts and per-specialty totals.
lemma.counts <- lemmatized.data %>% dplyr::count(medical_specialty, lemma)
total.counts <- lemma.counts %>% 
                      dplyr::group_by(medical_specialty) %>% 
                      dplyr::summarise(total=sum(n))

all.counts <- dplyr::left_join(lemma.counts, total.counts)
## Joining with `by = join_by(medical_specialty)`
##  calculate the term frequency / inverse document frequency (tf-idf)

all.counts.tfidf <- tidytext::bind_tf_idf(all.counts, lemma, medical_specialty, n) 
##  print top 10 lemma by tf-idf within each specialty

all.counts.tfidf %>% dplyr::group_by(medical_specialty) %>% dplyr::slice_max(order_by=tf_idf, n=10)
## # A tibble: 30 × 7
## # Groups:   medical_specialty [3]
##    medical_specialty lemma                     n total       tf   idf   tf_idf
##    <chr>             <chr>                 <int> <int>    <dbl> <dbl>    <dbl>
##  1 Orthopedic        range motion            139 97774 0.00142  0.405 0.000576
##  2 Orthopedic        carpal ligament          85 97774 0.000869 0.405 0.000352
##  3 Orthopedic        transverse carpal        81 97774 0.000828 0.405 0.000336
##  4 Orthopedic        extremity prepped        79 97774 0.000808 0.405 0.000328
##  5 Orthopedic        proximal phalanx         75 97774 0.000767 0.405 0.000311
##  6 Orthopedic        department anesthesia    63 97774 0.000644 0.405 0.000261
##  7 Orthopedic        dissection carried       63 97774 0.000644 0.405 0.000261
##  8 Orthopedic        steri strips             59 97774 0.000603 0.405 0.000245
##  9 Orthopedic        closed vicryl            58 97774 0.000593 0.405 0.000241
## 10 Orthopedic        dressing applied         58 97774 0.000593 0.405 0.000241
## # ℹ 20 more rows

Q8) Are there any lemmas that stand out in these lists? Why or why not?

Generally, there is no common lemma that particularly has a significantly higher TF-IDF score than other lemmas in all 3 specialties, because the list of top lemmas in Radiology is quite different from the remaining two specialties. For Surgery and Orthopedics, “steri strips” and “closed vicryl” appear among the top 10 lemmas for both categories, as well as some bi-grams containing the word “anesthesia”.

These words are common tools or techniques that will be adopted when doing a surgery, involving those orthopedic-related surgery operations. From below, each of the two terms appear in around 200 transcriptions, and commonly mentioned in 60 notes.

##  Extract the full transcriptions from notes containing both terms "steri strips" and "closed vicryl"

# Notes mentioning each term, and the overlap between the two sets.
a = analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'steri strips'))
b = analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'closed vicryl')) 

print(dim(a)[1])
## [1] 207
print(dim(b)[1])
## [1] 201
# Number of notes containing both terms.
print(dim(subset(a, transcription %in% b$transcription))[1])
## [1] 60
##  Extract 1 example of the full transcriptions from notes containing the term "steri strips"

analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'steri strips'))  %>% dplyr::slice(1)
## # A tibble: 1 × 2
##   medical_specialty transcription                                               
##   <chr>             <chr>                                                       
## 1 Surgery           preoperative diagnoses hallux rigidus left foot elevated me…

Q9) Extract an example of one of the other “top lemmas” by modifying the above code.

##  Extract 1 example of the full transcriptions from notes containing the term "closed vicryl"

# Q9 answer: first matching note for another top lemma.
analysis.data %>% dplyr::select(medical_specialty, transcription) %>% dplyr::filter(stringr::str_detect(transcription, 'closed vicryl'))  %>% dplyr::slice(1)
## # A tibble: 1 × 2
##   medical_specialty transcription                                               
##   <chr>             <chr>                                                       
## 1 Surgery           title operation placement ventriculoperitoneal vp shunts st…

Topic Modelling

First lets calculate a term frequency matrix for each transcription:

# Per-note lemma counts for topic modelling.
# NOTE(review): this reassigns lemma.counts/all.counts from the earlier per-specialty
# TF-IDF section — intentional shadowing, but distinct names would be clearer.
lemma.counts <- lemmatized.data %>% dplyr::count(note_id, lemma)
total.counts <- lemma.counts %>% 
                      dplyr::group_by(note_id) %>% 
                      dplyr::summarise(total=sum(n))

all.counts <- dplyr::left_join(lemma.counts, total.counts)
## Joining with `by = join_by(note_id)`
# Cast into a document-term matrix (documents = note_id, terms = lemma, values = counts).
emr.dcm <- all.counts %>% tidytext::cast_dtm(note_id, lemma, n)

Then we can use LDA function to fit a 5 topic (k=5) LDA-model:

# Fit a 5-topic LDA model (fixed seed for reproducibility).
emr.lda <- topicmodels::LDA(emr.dcm, k=5, control=list(seed=42))
# Tidy the per-topic term probabilities (beta matrix) into long format.
emr.topics <- tidytext::tidy(emr.lda, matrix='beta')

Then we can extract the top terms per assigned topic:

# Top 10 highest-probability (beta) terms within each of the 5 topics.
top.terms <- emr.topics %>% dplyr::group_by(topic) %>% 
  dplyr::slice_max(beta, n=10) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(topic, -beta)


# Faceted bar chart of top terms per topic; reorder_within() orders terms
# independently inside each facet, paired with scale_y_reordered().
top.terms %>% 
  dplyr::mutate(term=tidytext::reorder_within(term, beta, topic)) %>% 
  ggplot2::ggplot(ggplot2::aes(beta, term, fill=factor(topic))) + 
    ggplot2::geom_col(show.legend=FALSE) + 
    ggplot2::facet_wrap(~ topic, scales='free')  +
    ggplot2::theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1)) +
    tidytext::scale_y_reordered()

# Per-document topic proportions (gamma matrix) for the 5-topic model.
specialty_gamma <- tidytext::tidy(emr.lda, matrix='gamma')

# we need to join in the specialty from the note_id
note_id_specialty_mapping <- lemmatized.data %>%
  dplyr::mutate(document=as.character(note_id)) %>% 
  dplyr::select(document, medical_specialty) %>% 
  dplyr::distinct()

# NOTE(review): implicit join key — consider an explicit by = "document" to silence the message.
specialty_gamma <- dplyr::left_join(specialty_gamma, note_id_specialty_mapping)
## Joining with `by = join_by(document)`
# Boxplots of gamma per topic, faceted by specialty, to see which topics dominate each specialty.
specialty_gamma %>%
  dplyr::mutate(medical_specialty = reorder(medical_specialty, gamma * topic)) %>%
  ggplot2::ggplot(ggplot2::aes(factor(topic), gamma)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~ medical_specialty) +
  ggplot2::labs(x = "topic", y = expression(gamma))

Q10) Repeat this with a 6 topic LDA, do the top terms from the 5 topic LDA still turn up? How do the specialties get split into sub-topics?

Comparing the 6-topic LDA model and the 5-topic LDA model:

  1. The 1st and 3rd topics of both LDA models are basically the same, just the rankings of the top 10 terms have some differences;

  2. The 2nd and 4th topics of both LDA models should be relevant, but some new terms enter the top 10 for the 6-topic LDA model, for example, “proximal phalanx”, “ankle tourniquet”, “metatarsal head” for the 2nd topic, as well as “central canal”, “lower extremity” for the 4th topic.

    This shows the 6-topic LDA model can capture more specific themes or entities for these two topics, while the 5-topic LDA model repeats some high-frequency terms that also appear in other topics, like “tolerated procedure”, “patient tolerated”.

  3. The 5th topic in both models are similar, just the 6-topic LDA model identifies “vicryl suture” that is missing in the 5-topic LDA model.

  4. The new 6th topic is left body related, with terms like “left main” and “left anterior” regrouped to this topic.

There is a great change in the topic distribution of each specialty. In the 5-topic LDA model, each specialty has a dominant topic (Orthopedics: topic 2, Surgery: topic 3, Radiology: topic 4). However, in the 6-topic LDA model, Orthopedics and Surgery do not show a dependence on a particular topic. While topic 4 is still the major topic for Radiology, its sub-topic changes from topic 1 to the newly formed topic 6.

This observation may be due to the algorithm of LDA, because LDA is a probabilistic model that each note is assumed to be containing a mixture of topics (document-topic distribution) and a topic is made of a mixture of terms (topic-term distribution). The probabilistic distribution is guided by the inputted TF-IDF frequency matrix. Some frequent terms appearing in a majority of notes of the three specialties will have a higher probability of being drawn, which explains why they are top-ranked in each topic. When all topics have some extent of similarities, the document overall will tend to have a relatively equal distribution of topics.

Adding 1 topic may have increased the complexity and perplexity of the model. Although some topics (e.g., 2nd and 4th) become more specific, the topic probabilities for each note may become less variant. Upon aggregation by the three specialties, the topic contribution becomes more even.

D. Buenaño-Fernandez, M. González, D. Gil and S. Luján-Mora, (2020). “Text Mining of Open-Ended Questions in Self-Assessment of University Teachers: An LDA Topic Modeling Approach,” in IEEE Access, vol. 8, pp. 35318-35330, doi: 10.1109/ACCESS.2020.2974983.

Redo LDA function to fit a 6 topic LDA-model:

# Fit a 6-topic LDA model on the document-term matrix (fixed seed for reproducibility).
emr.lda.2 <- topicmodels::LDA(emr.dcm, k=6, control=list(seed=42))
# Tidy the per-topic term probabilities (beta matrix) into long format.
emr.topics.2 <- tidytext::tidy(emr.lda.2, matrix='beta')

Extract the top terms per assigned topic:

# Top 10 highest-probability (beta) terms within each of the 6 topics.
top.terms.2 <- emr.topics.2 %>% dplyr::group_by(topic) %>% 
  dplyr::slice_max(beta, n=10) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(topic, -beta)


# Faceted bar chart of top terms per topic; reorder_within() orders terms
# independently inside each facet, paired with scale_y_reordered().
top.terms.2 %>% 
  dplyr::mutate(term=tidytext::reorder_within(term, beta, topic)) %>% 
  ggplot2::ggplot(ggplot2::aes(beta, term, fill=factor(topic))) + 
    ggplot2::geom_col(show.legend=FALSE) + 
    ggplot2::facet_wrap(~ topic, scales='free')  +
    ggplot2::theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1)) +
    tidytext::scale_y_reordered()

# Per-document topic proportions (gamma matrix) for the 6-topic model.
specialty_gamma.2 <- tidytext::tidy(emr.lda.2, matrix='gamma')

# we need to join in the specialty from the note_id
note_id_specialty_mapping.2 <- lemmatized.data %>%
  dplyr::mutate(document=as.character(note_id)) %>% 
  dplyr::select(document, medical_specialty) %>% 
  dplyr::distinct()

# NOTE(review): implicit join key — consider an explicit by = "document" to silence the message.
specialty_gamma.2 <- dplyr::left_join(specialty_gamma.2, note_id_specialty_mapping.2)
## Joining with `by = join_by(document)`
# Boxplots of gamma per topic, faceted by specialty, to see which topics dominate each specialty.
specialty_gamma.2 %>%
  dplyr::mutate(medical_specialty = reorder(medical_specialty, gamma * topic)) %>%
  ggplot2::ggplot(ggplot2::aes(factor(topic), gamma)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~ medical_specialty) +
  ggplot2::labs(x = "topic", y = expression(gamma))

<!-- NOTE(review): unresolved git merge-conflict terminator (">>>>>>> 9f96bfb2") was here. Resolve the conflict by keeping a single copy of the duplicated content above. -->